library(devtools)
library(arrow)
library(foreign)
library(conText) #install_github("prodriguezsosa/conText")
library(dplyr)
library(ggplot2)
library(text2vec)
library(tidytext)
library(quanteda)
library(stringr)
library(forcats)
library(gridExtra)
library(xtable)
library(boot)
library(boot.pval)

# set path to working directory
if(Sys.info()['user']==''){path_to_data<-""} #set filepath for replication data folder
setwd(path_to_data)
# set path to save files
savedfilepath <- '' #set filepath for objects to save


# Load the data
pre_trained <- readRDS(paste0(path_to_data, 'glove.rds')) # GloVe embeddings
transform_matrix <- readRDS(paste0(path_to_data, 'khodakA.rds')) # transformation matrix
ken_oped <- read.csv('ken_oped_data.csv') # Kenyan media corpus

# format date and month variables
# create month variable
create_year_month <- function(data){
  data <- data %>% 
    group_by(month = lubridate::floor_date(date, "month"))
  return(data)
}

ken_oped$date <- as.Date(ken_oped$date, '%Y-%m-%d')
ken_oped$year <- format(ken_oped$date, '%Y')
ken_oped <- create_year_month(ken_oped)
ken_oped$month

#### Term Frequency Over Time ####

# function to calculate number of articles per year featuring specified terms 
 
term_frequency <- function(phrase){
  
  mentions <- grepl(phrase, ken_oped$body, ignore.case = T)
  mentions <- as.data.frame(mentions)
  
  colnames(mentions) <- 'mentions'
  ken2 <- cbind(ken_oped, mentions)
  
  ken_mentions <- ken2 %>%
    group_by(month) %>%
    summarize(freq=sum(mentions))
  
  the_plot <- ggplot(ken_mentions, aes(x=month, y=freq))+
    geom_bar(stat='identity', width=20, alpha=.5)+
    scale_x_date(date_breaks = "1 year", date_labels='%Y')+
    theme_minimal()+
    geom_vline(xintercept = as.Date("2010-03-01"), linetype=4, size=1, color='red')+
    geom_vline(xintercept = as.Date("2016-04-01"), linetype=4, size=1, color='red')+
    annotate('text', x=as.Date("2007-12-31"), y = 30, label= 'ICC\ninvestigations\nbegin in Kenya', color='red', size=7)+
    annotate('text', x=as.Date("2018-02-01"), y = 30, label= 'Final case\nterminated', color='red', size=7)+
    theme(axis.text.y = element_text(angle = 0, size=20, vjust = .5),
          axis.text.x = element_text(angle = 45, size=20, vjust = .5),
          axis.title = element_text(size=20),
          plot.margin = margin(0, 2, 0, 1, "cm"))+
    labs(y='Articles per Month', x="Date")
  
  return(the_plot)
}

elite_mentions <- '(deputy president|president|kenyatta|ruto) (said|says|argued|stated)|(said|says|argued|stated) (kenyatta|ruto|the deputy president|the president)'

#### Generate Figure 2: How often were elites quoted/mentioned ####
term_frequency(elite_mentions)

ggsave(
       filename = 'figure_2.jpg', 
       width=12, height=7,
       path=savedfilepath, device = 'jpg')


#### Estimating Diachronic Cosine Similarity Scores ####

# custom stopword list
custom_stop_words = c('a', 'am', 'an', 'at','as','are', 'and', 'any', 'all',
                      'be', 'because', 'been', 'being', 'both', 'but', 'by',
                      'can', 'cannot', 'could', 'did', 'do', 'does', 'doing',
                      'few', 'for', 'from', 'had', 'has', 'have', 'how',
                      'he', 'she', 'her', 'herself', 'him', 'himself', 'his', 'they', 'them', 'ours', 'our', 'their', 'theirs', 'themselves', 'you', 'your', 'yours','yourself', 'yourselves', 'myself', 'my',
                      'is', 'it', 'itself',
                      'more', 'most', 'me', 'mine', 'must', 
                      'until', 'nonetheless', 'nothing', 'never', 'neither', 'nobody', 'nowhere',
                      'many', 'few', 'some', 'little', 'lot', 'general', 'specific', 'whom', 'been', 'being', 'having', 'does', 'did', 'doing',  
                      'about', 'against', 'between', 'into', 'through', 'during', 
                      'before', 'after', 'above', 'below', 'from', 'down', 
                      'off', 'over', 'under', 'again', 'further', 'once',
                      'not', 'nor', 'no', 'none', 'noone', 'neither',
                      'only', 'off', 'once', 'only', 'own', 'out', 'other',
                      'said', 'say', 'says', 'same', 'should', 'such',
                      'that', 'the', 'then', 'than', 'this', 'those', 'through', 'to', 'too','these',
                      'was', 'were', 'will', 'wont', 'when', 'where', 'who', 'what', 'while', 'whom', 'why', 'with', 'would')


#### baseline function used for remaining models ####
# computes monthly estimates for cosine similarity between anchor, keyword set

get_cosines <- function(df, anchor, word_features, pre_trained, transform_matrix, window_size){
  
  set.seed(23188)
  
  the_corpus <- corpus(df, 
                       docid_field = 'url', text_field = 'body', 
                       meta = c('year', 'month', 'opinion', 'publisher')) #create corpus object from dataframe
  
  the_toks <- tokens(the_corpus, remove_punct = T, remove_numbers = T,remove_symbols = T,
                     remove_separators = T,include_docvars = T) #tokenize corpus
  
  the_toks <- tokens_tolower(the_toks) #pre-processing tokens
  
  the_toks <- tokens_select(the_toks, pattern = custom_stop_words, selection = "remove", min_nchar = 3,
                            padding = TRUE) #pre-processing tokens
  
  toks_cont <- tokens_context(the_toks, 
                              pattern = anchor, 
                              window = window_size, case_insensitive = T) #create tokens context using specified window size
  
  results <- get_cos_sim(toks_cont, 
                         groups = docvars(toks_cont, 'month'),
                         features=word_features,
                         pre_trained=pre_trained,
                         transform=T,
                         transform_matrix=transform_matrix,
                         bootstrap=T,
                         num_bootstraps=100,
                         as_list=FALSE) 
  
  results <- rename(results, date=target)
  results$date <- as.Date(results$date)
  
  return(results)
}



word_features <- c('politicize', 'persecute', 'victimize', 'bias', 'discriminate', 'unfair')

results <- get_cosines(ken_oped, 
                       word_features = word_features,
                       pre_trained = pre_trained,
                       transform_matrix = transform_matrix,
                       anchor = c('icc'),
                       window_size = 24L)


# group by date, calculate mean cosine similarity within group
agg_results <- 
  results %>%
  group_by(date) %>% 
  summarize(value = mean(value))


#### Generate Figure 3: Aggregated Keyword Trends ####
ggplot(agg_results,
       aes(x=date, y=value)) + 
  geom_point(size=0.5)+
  geom_smooth(span = .15, inherit.aes=T)+
  geom_hline(yintercept=0, linetype="dotted", color = "black", size=0.5) + 
  labs(y='Cosine Similarity Score', 
       fill='Keyword', title='Cosine Similarity Score between "icc" and political keywords', x="Date of Publication (Monthly Estimates)")+
  scale_colour_viridis_d()+
  scale_x_date(date_breaks = "1 year", date_labels='%Y')+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, size=16), 
        axis.text.y = element_text(size=16), 
        axis.title = element_text(size=16), 
        title = element_text(size=16),
        strip.text = element_text(size=16), legend.position='none')

ggsave(
  filename = 'figure_3.jpg',
  path=savedfilepath,
  device = 'jpg', 
  width = 14, height=8)


#### CPD Algorithm ####
#### Algorithm used to estimate change point(s) ####

# first step: calculate cumulative sums
all_cum_sum <- data.frame(date = agg_results$date, cum_sums = cumsum(agg_results$value-mean(agg_results$value)))
cp1 <- all_cum_sum$date[which.max(abs(all_cum_sum$cum_sums))] #CUSUM ESTIMATOR

# plot cumulative sums
changepoint1 <- ggplot(all_cum_sum, aes(x=date, y=cum_sums))+
  geom_point(size=0.5)+
  geom_line()+
  geom_hline(yintercept=0, linetype="dotted", color = "black", size=0.5) + 
  annotate('text',  x=as.Date("2015-12-01"), y = 0.25, label= "S[m]", color='red', size=4, parse=T)+
  geom_vline(xintercept = cp1, linetype='dashed', color='red')+ #cusum changepoint estimator
  labs(title='Changepoint 1', x="", y="Cumulative Sums")+theme_minimal()+
  theme(axis.text.x = element_text(angle = 0, size=12), 
        axis.text.y = element_text(size=12), 
        axis.title = element_text(size=14),
        title = element_text(size=14),
        strip.text = element_text(size=14))

# Simulate CUSUM to test for significance

# cusum estimator function
s_diff_fun <- function(d, inds){
  reordered <- d[inds,]$value
  cum_sums = cumsum(reordered-mean(reordered))
  sim_s_diff <- max(cum_sums)-min(cum_sums)
  return(sim_s_diff)
}

set.seed(20010)
bootdiff <- boot(data = agg_results, statistic = s_diff_fun, R = 10000)
booted_t <- as.data.frame(bootdiff$t) 
boot.pval(bootdiff, type='perc', theta_null = bootdiff$t0)

#### Generate Figure 4: CPD ####

booted1 <- ggplot(booted_t)+
  geom_histogram(mapping = aes(V1), bins = 100, color="black", fill="white") + 
  geom_vline(xintercept =  bootdiff$t0, color='red', linetype='dashed')+
  annotate('text', x=bootdiff$t0 + .05, y=50, label='S[diff]', parse=T, color='red', size=4)+
  labs(x='t-stat', y='frequency', title='Bootstrapped t-stat')+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 0, size=12), 
        axis.text.y = element_text(size=12), 
        axis.title = element_text(size=14),
        title = element_text(size=14),
        strip.text = element_text(size=14))


p <- grid.arrange(changepoint1, booted1, nrow=1)

ggsave(filename='figure_4.jpg', plot = p, width = 8, height=4,
       path=savedfilepath,
       device = 'jpg')
